This Data Challenge is inspired by a Kaggle prediction competition – Natural Language Processing with Disaster Tweets. The goal of this challenge is to build various machine learning models that predict which Tweets are about real disasters and which ones are not, labelling a tweet (1) for disaster and (0) for non-disaster. For the purpose of these comparison experiments, Natural Language Processing techniques will be applied.
The train dataset that is going to be used consists of 7613 tweets that were hand-classified. The columns in both the train and test csv files are “id” - a unique identifier for each tweet, “keyword” - a particular keyword from the tweet (may be blank), “location” - the location the tweet was sent from (may be blank), and “text” - the text of the tweet. Furthermore, the train dataset has a column “target” which designates whether a tweet is about a real disaster (1) or not (0).
# !pip install geopy
# !pip install folium
# !pip install transformers
# !pip install ipywidgets
# !pip install widgetsnbextension
# nltk.download('wordnet')
# import nltk
# nltk.download('punkt')
# download the glove.6B.100d.txt file (pretrained GloVe word embeddings) before running
import numpy as np
import pandas as pd
#sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, metrics
from sklearn.metrics import (precision_score, recall_score, f1_score, classification_report, accuracy_score)
# text processing
import re
import string
import nltk
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# visualizations
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from plotly import graph_objs as go
import plotly.figure_factory as ff
import plotly.offline as py
# map
import folium
from folium import plugins
import plotly.figure_factory as ff
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
# Tensorflow & Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback, ReduceLROnPlateau
from tensorflow.keras.layers import (LSTM, Embedding, BatchNormalization, Dense, TimeDistributed, Dropout,
Bidirectional, Flatten, GlobalMaxPool1D, Input, GlobalAveragePooling1D)
# BERT
import transformers
from transformers import TFBertModel, BertTokenizer
from tokenizers import BertWordPieceTokenizer
from tqdm.notebook import tqdm
# XGBoost
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# Load the Kaggle disaster-tweet data; only the train split carries the
# hand-labelled 'target' column.
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")
# Display the training frame (7613 rows x 5 columns).
df_train
| id | keyword | location | text | target | |
|---|---|---|---|---|---|
| 0 | 1 | earthquake | NaN | Our Deeds are the Reason of this #earthquake M... | 1 |
| 1 | 4 | forest%20fire | Canada | Forest fire near La Ronge Sask. Canada | 1 |
| 2 | 5 | evacuation | NaN | All residents asked to 'shelter in place' are ... | 1 |
| 3 | 6 | wildfires | California, USA | 13,000 people receive #wildfires evacuation or... | 1 |
| 4 | 7 | wildfires | Alaska, USA | Just got sent this photo from Ruby #Alaska as ... | 1 |
| ... | ... | ... | ... | ... | ... |
| 7608 | 10869 | bridge%20collapse | NaN | Two giant cranes holding a bridge collapse int... | 1 |
| 7609 | 10870 | wildfire | California | @aria_ahrary @TheTawniest The out of control w... | 1 |
| 7610 | 10871 | volcano | Hawaii, USA | M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt... | 1 |
| 7611 | 10872 | collided | Portugal | Police investigating after an e-bike collided ... | 1 |
| 7612 | 10873 | wildfire | California | The Latest: More Homes Razed by Northern Calif... | 1 |
7613 rows × 5 columns
# URL extraction: capture the first http(s) link in each tweet (NaN if none)
# into a new 'URL' column.
pattern = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
df_train['URL'] = df_train['text'].str.extract(pattern, expand=True)
print(df_train)
id keyword location \
0 1 earthquake NaN
1 4 forest%20fire Canada
2 5 evacuation NaN
3 6 wildfires California, USA
4 7 wildfires Alaska, USA
... ... ... ...
7608 10869 bridge%20collapse NaN
7609 10870 wildfire California
7610 10871 volcano Hawaii, USA
7611 10872 collided Portugal
7612 10873 wildfire California
text target \
0 Our Deeds are the Reason of this #earthquake M... 1
1 Forest fire near La Ronge Sask. Canada 1
2 All residents asked to 'shelter in place' are ... 1
3 13,000 people receive #wildfires evacuation or... 1
4 Just got sent this photo from Ruby #Alaska as ... 1
... ... ...
7608 Two giant cranes holding a bridge collapse int... 1
7609 @aria_ahrary @TheTawniest The out of control w... 1
7610 M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt... 1
7611 Police investigating after an e-bike collided ... 1
7612 The Latest: More Homes Razed by Northern Calif... 1
URL
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
... ...
7608 http://t.co/STfMbbZFB5
7609 NaN
7610 http://t.co/zDtoyd8EbJ
7611 NaN
7612 http://t.co/YmY4rSkQ3d
[7613 rows x 6 columns]
# Number extraction: capture the first run of digits in each tweet (NaN if
# none) into a new 'number' column.
p = r'([0-9]+)'
df_train['number'] = df_train['text'].str.extract(p, expand=True)
print(df_train)
id keyword location \
0 1 earthquake NaN
1 4 forest%20fire Canada
2 5 evacuation NaN
3 6 wildfires California, USA
4 7 wildfires Alaska, USA
... ... ... ...
7608 10869 bridge%20collapse NaN
7609 10870 wildfire California
7610 10871 volcano Hawaii, USA
7611 10872 collided Portugal
7612 10873 wildfire California
text target \
0 Our Deeds are the Reason of this #earthquake M... 1
1 Forest fire near La Ronge Sask. Canada 1
2 All residents asked to 'shelter in place' are ... 1
3 13,000 people receive #wildfires evacuation or... 1
4 Just got sent this photo from Ruby #Alaska as ... 1
... ... ...
7608 Two giant cranes holding a bridge collapse int... 1
7609 @aria_ahrary @TheTawniest The out of control w... 1
7610 M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt... 1
7611 Police investigating after an e-bike collided ... 1
7612 The Latest: More Homes Razed by Northern Calif... 1
URL number
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN 13
4 NaN NaN
... ... ...
7608 http://t.co/STfMbbZFB5 5
7609 NaN NaN
7610 http://t.co/zDtoyd8EbJ 1
7611 NaN NaN
7612 http://t.co/YmY4rSkQ3d 4
[7613 rows x 7 columns]
# Spot-check 10 random rows after adding the URL and number columns.
df_train.sample(10)
| id | keyword | location | text | target | URL | number | |
|---|---|---|---|---|---|---|---|
| 6677 | 9568 | thunder | London | Okay maybe not as extreme as thunder and light... | 0 | https://t.co/ETuuYISLHw | 16 |
| 4516 | 6417 | hurricane | NaN | AngelRiveraLibÛ_ #Snowden 'may have' broken l... | 1 | http://t.co/jAaWuiOvdc | NaN |
| 3218 | 4617 | emergency%20services | Henderson, Nevada | Apply now to work for Dignity Health as #RN #E... | 0 | http://t.co/FDiU44jLDJ | 7 |
| 481 | 691 | attack | ph | anxiety attack ?? | 0 | NaN | NaN |
| 1844 | 2651 | crashed | Buenos Aires | MH370: Intact part lifts odds plane glided not... | 1 | http://t.co/8pdnHH6tzH | 370 |
| 5815 | 8298 | rubble | NaN | My parents are so impulsive sometimes. I remem... | 0 | NaN | NaN |
| 3243 | 4659 | engulfed | Kuwait | He came to a land which was engulfed in tribal... | 1 | NaN | NaN |
| 3228 | 4632 | emergency%20services | Sydney, New South Wales | Goulburn man Henry Van Bilsen missing: Emergen... | 1 | http://t.co/z99pKJzTRp | 99 |
| 6183 | 8825 | sirens | Nanaimo, BC, Canada | Photoset: hakogaku: ?åÊI am a kurd. i was born... | 1 | http://t.co/obp595W7tm | 595 |
| 2991 | 4298 | dust%20storm | Idaho | @NWSPocatello BG-16: So far brunt of storm jus... | 0 | NaN | 16 |
# Crude link removal: keep only the text before the first '://' separator.
# NOTE(review): this also discards everything after the link and leaves a
# dangling 'https' token (see the example below) — confirm this is intended.
df_train['text'] = df_train['text'].str.split('://').str[0]
df_train['text'].iloc[76]
"'By accident' they knew what was gon happen https"
# Decode URL-encoded keywords: '%20' is a percent-encoded space.
# might be replaced with ' ' a white space
df_train['keyword']=df_train['keyword'].str.replace('%20', '_')
# Replace any remaining stray '%' characters the same way.
# might be replaced with ' ' a white space
df_train['keyword']=df_train['keyword'].str.replace('%', '_')
df_train.head()
| id | keyword | location | text | target | URL | number | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | earthquake | NaN | Our Deeds are the Reason of this #earthquake M... | 1 | NaN | NaN |
| 1 | 4 | forest_fire | Canada | Forest fire near La Ronge Sask. Canada | 1 | NaN | NaN |
| 2 | 5 | evacuation | NaN | All residents asked to 'shelter in place' are ... | 1 | NaN | NaN |
| 3 | 6 | wildfires | California, USA | 13,000 people receive #wildfires evacuation or... | 1 | NaN | 13 |
| 4 | 7 | wildfires | Alaska, USA | Just got sent this photo from Ruby #Alaska as ... | 1 | NaN | NaN |
# Print a few disaster tweets. Use positional indexing (.iloc) rather than
# label-based d_t[i]: the original relied on row labels 1-4 happening to be
# disaster rows, which silently breaks if the data order changes.
d_t = df_train[df_train['target'] == 1]['text']
for i in range(1, 5):
    print(d_t.iloc[i])
Forest fire near La Ronge Sask. Canada All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected 13,000 people receive #wildfires evacuation orders in California Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school
# Non-disaster tweets (target == 0); the first examples start at row label 15.
nd_t = df_train[df_train['target'] != 1]['text']
print(nd_t.head(5))
15 What's up man? 16 I love fruits 17 Summer is lovely 18 My car is so fast 19 What a goooooooaaaaaal!!!!!! Name: text, dtype: object
# Restrict to non-disaster (target == 0) rows and count missing metadata.
# The original first assigned a boolean mask to df_temp and then immediately
# overwrote it with the filtered frame; the dead assignment is removed.
df_temp = df_train[df_train['target']==0]
loc_null_values_by_target = df_temp['location'].isnull().sum()
keyword_null_values_by_target = df_temp['keyword'].isnull().sum()
print(f"Null values for location for fake disasters: {loc_null_values_by_target}, null keywords for fake disaster: {keyword_null_values_by_target}")
Null values for location for fake disasters: 1459, null keywords for fake disaster: 0
# Bar chart: missing keyword vs. missing location counts within the
# non-disaster (target == 0) subset.
fig = go.Figure()
for label, trace_name, value, colour in (
    ('Keyword - Fake Disasters Dataset', 'Keyword', keyword_null_values_by_target, 'salmon'),
    ('Location - Fake Disasters Dataset', 'Location', loc_null_values_by_target, 'cadetblue'),
):
    fig.add_trace(go.Bar(
        x=[label],
        y=[value],
        name=trace_name,
        text=[value],
        textposition='auto',
        marker=dict(color=colour),
    ))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Missing values of keyword & location for fake disasters</span>',
)
fig.show()
# Compare how many 'location' values are missing in the train vs. test split.
loc_null_values = df_train['location'].isnull().sum()
loc_null_values_test = df_test['location'].isnull().sum()
fig = go.Figure()
for label, value, colour in (
    ('Location - Train Dataset', loc_null_values, 'cadetblue'),
    ('Location - Test Dataset', loc_null_values_test, 'salmon'),
):
    fig.add_trace(go.Bar(
        x=[label],
        y=[value],
        name='Location',
        text=[value],
        textposition='auto',
        marker=dict(color=colour),
    ))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Missing values of location</span>',
)
fig.show()
# Word count per tweet: number of chunks when splitting on single spaces.
df_train['text_len'] = df_train['text'].str.split(' ').str.len()
df_train.sample(n=5)
| id | keyword | location | text | target | URL | number | text_len | |
|---|---|---|---|---|---|---|---|---|
| 7606 | 10866 | suicide_bomber | NaN | Suicide bomber kills 15 in Saudi security site... | 1 | http://t.co/nF4IculOje | 15 | 20 |
| 2722 | 3908 | devastated | FOLLOWS YOU everywhere you go | Obama Declares Disaster for Typhoon-Devastated... | 1 | http://t.co/JCszCJiHlH | NaN | 15 |
| 1837 | 2642 | crashed | too far | He was only .4 of a second faster than me and ... | 0 | NaN | 4 | 21 |
| 795 | 1153 | blight | NaN | http | 0 | http://t.co/ETkd58Un8n | 58 | 1 |
| 7412 | 10605 | wounded | NaN | Have you ever seen the President \r\nwho kille... | 0 | NaN | NaN | 24 |
# Class balance: row count per target value, index-sorted (0 first, then 1).
balance_counts = df_train.groupby('target')['target'].agg('count').values
balance_counts
array([4343, 3270], dtype=int64)
# Class balance bar chart: non-disaster ("Fake") vs. real-disaster counts.
fig = go.Figure()
for label, value, colour in (
    ('Fake', balance_counts[0], 'salmon'),
    ('Real disaster', balance_counts[1], 'cadetblue'),
):
    fig.add_trace(go.Bar(
        x=[label],
        y=[value],
        name=label,
        text=[value],
        textposition='auto',
        marker=dict(color=colour),
    ))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Dataset distribution by target</span>'
)
fig.show()
# Per-keyword mean target = share of disaster tweets for that keyword; used
# only to order the countplot from most- to least-"disastrous" keyword.
df_train['target_mean'] = df_train.groupby('keyword')['target'].transform('mean')
fig = plt.figure(figsize=(8, 72), dpi=100)
sns.countplot(y=df_train.sort_values(by='target_mean', ascending=False)['keyword'],
              hue=df_train.sort_values(by='target_mean', ascending=False)['target'], palette='bwr')
plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=12)
plt.legend(loc=1)
plt.title('Target Distribution in Keywords')
plt.show()
# Drop the helper column so it does not leak into later feature steps.
df_train.drop(columns=['target_mean'], inplace=True)
# Word clouds for disaster vs. non-disaster tweets. Join with a space: the
# original ''.join concatenated tweets with no separator, fusing the last
# word of one tweet with the first word of the next and distorting counts.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[150, 50])
wc1 = WordCloud().generate(' '.join(d_t))
ax1.imshow(wc1)
ax1.axis('off')
ax1.set_title('Disaster tweets', fontsize=18)
wc2 = WordCloud().generate(' '.join(nd_t))
ax2.imshow(wc2)
ax2.axis('off')
ax2.set_title('Non Disaster tweets', fontsize=18)
Text(0.5, 1.0, 'Non Disaster tweets')
# Horizontal count plot of the ten most frequent keywords.
plt.figure(figsize=(9, 6))
top10 = df_train['keyword'].value_counts().iloc[:10].index
sns.countplot(y=df_train['keyword'], order=top10, color='salmon')
plt.title('Top 10 keywords', fontsize=13)
plt.show()
# Build a (location, counts) frame for the 20 most frequent location strings.
# .head(20) replaces the original tuple-slice value_counts()[:20,], which is
# rejected by modern pandas (tuple keys on a Series).
df = df_train['location'].value_counts().head(20)
df = pd.DataFrame(df)
df = df.reset_index()
df.columns = ['location', 'counts']
# Geocode each location name, rate-limited per Nominatim's usage policy.
geolocator = Nominatim(user_agent="DC2_NLP_Disaster Tweets")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
dictt_latitude = {}
dictt_longitude = {}
for i in df['location'].values:
    print(i)
    location = geocode(i)
    # Guard: free-form strings like 'Worldwide' may fail to geocode, in which
    # case geopy returns None and the original code crashed on .latitude.
    if location is not None:
        dictt_latitude[i] = location.latitude
        dictt_longitude[i] = location.longitude
# Locations without a geocoding hit get NaN coordinates via .map().
df['latitude'] = df['location'].map(dictt_latitude)
df['longitude'] = df['location'].map(dictt_longitude)
USA New York United States London Canada Nigeria UK Los Angeles, CA India Mumbai Washington, DC California Kenya California, USA Worldwide Chicago, IL Australia Everywhere New York, NY United Kingdom
# World map with one circle per top location, radius scaled by tweet count.
# Removed the unused 'markers' list and the dead 'loss' variable from the
# original loop body.
map1 = folium.Map(location=[10.0, 10.0], tiles='openstreetmap', zoom_start=2.3)
for i, row in df.iterrows():
    if row['counts'] > 0:
        radius = row['counts'] * 0.4
        folium.CircleMarker(
            [float(row['latitude']), float(row['longitude'])],
            radius=float(radius),
            color='#ef4f61',
            fill=True,
        ).add_to(map1)
map1
Make text lowercase, remove text in square brackets, remove links, remove punctuation, and remove words containing numbers.
def clean_text(text):
    """Normalize a tweet: lowercase; strip [bracketed] segments, links, line
    breaks, repeated whitespace, punctuation, and any word containing a digit.

    Order matters: whitespace is collapsed *before* punctuation/digit-word
    removal, so those later steps can leave double spaces behind.
    Regex patterns are now raw strings — '\\s+' and '\\w*\\d\\w*' as plain
    strings are invalid escape sequences (DeprecationWarning on modern Python).
    """
    # Convert to lower case
    text = text.lower()
    # Remove [bracketed] segments, e.g. "[01:04 UTC]" (the original comment
    # called this "html tags", which it is not)
    text = re.sub(r'\[.*?\]', ' ', text)
    # Remove links
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Remove line breaks
    text = re.sub(r'\n', ' ', text)
    # Collapse runs of whitespace/tabs into a single space
    text = re.sub(r'\s+', ' ', text)
    # Remove punctuation characters
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove words containing digits (e.g. "m1" or "101")
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Apply clean function on a sample train string to show before/after,
# then clean the whole training corpus in place.
test_str = df_train.loc[417, 'text']
print('Original text: '+test_str+'\n')
print('Text after cleaning: '+clean_text(test_str))
df_train['text'] = df_train['text'].apply(clean_text)
df_train['text'].head(5)
Original text: Arson suspect linked to 30 fires caught in Northern California http://t.co/mmGsyAHDzb Text after cleaning: arson suspect linked to fires caught in northern california
0 our deeds are the reason of this earthquake ma... 1 forest fire near la ronge sask canada 2 all residents asked to shelter in place are be... 3 people receive wildfires evacuation orders in... 4 just got sent this photo from ruby alaska as s... Name: text, dtype: object
# NLTK's English stop words plus a few Twitter-specific noise tokens.
more_stopwords = ['u', 'im', 'c', 'amp']
stop_words = stopwords.words('english') + more_stopwords

def remove_stopwords(text):
    """Drop stop words from a space-separated string, preserving word order."""
    kept = [word for word in text.split(' ') if word not in stop_words]
    return ' '.join(kept)

df_train['text'] = df_train['text'].apply(remove_stopwords)
df_train.head(10)
| id | keyword | location | text | target | URL | number | text_len | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | earthquake | NaN | deeds reason earthquake may allah forgive us | 1 | NaN | NaN | 13 |
| 1 | 4 | forest_fire | Canada | forest fire near la ronge sask canada | 1 | NaN | NaN | 7 |
| 2 | 5 | evacuation | NaN | residents asked shelter place notified officer... | 1 | NaN | NaN | 22 |
| 3 | 6 | wildfires | California, USA | people receive wildfires evacuation orders ca... | 1 | NaN | 13 | 9 |
| 4 | 7 | wildfires | Alaska, USA | got sent photo ruby alaska smoke wildfires pou... | 1 | NaN | NaN | 17 |
| 5 | 8 | wildfires | California, USA | rockyfire update california hwy closed direc... | 1 | NaN | 20 | 18 |
| 6 | 10 | flood | USA | flood disaster heavy rain causes flash floodin... | 1 | NaN | NaN | 14 |
| 7 | 13 | fire | NaN | top hill see fire woods | 1 | NaN | NaN | 15 |
| 8 | 14 | evacuation | NaN | theres emergency evacuation happening building... | 1 | NaN | NaN | 12 |
| 9 | 15 | tornado | NaN | afraid tornado coming area | 1 | NaN | NaN | 10 |
# Tokenize on word characters only (drops any leftover punctuation).
# NOTE(review): this rebinds the global name `tokenizer`; a later cell
# rebinds it again to a BertTokenizer, so execution order matters.
tokenizer = RegexpTokenizer(r'\w+')
## Applying tokenization function on train sets
df_train['text'] = df_train['text'].map(tokenizer.tokenize)
## checkout train dataset tokens
df_train['text'].head(5)
0 [deeds, reason, earthquake, may, allah, forgiv... 1 [forest, fire, near, la, ronge, sask, canada] 2 [residents, asked, shelter, place, notified, o... 3 [people, receive, wildfires, evacuation, order... 4 [got, sent, photo, ruby, alaska, smoke, wildfi... Name: text, dtype: object
# Construct the lemmatizer once at module level: the original built a new
# WordNetLemmatizer inside every call, i.e. once per DataFrame row.
_lemmatizer = WordNetLemmatizer()

def lem_words(t):
    """Return the list of WordNet lemmas for the token list *t*.

    Uses the default (noun) part of speech for every token.
    """
    return [_lemmatizer.lemmatize(w) for w in t]
# Lemmatize every tokenized tweet (list of tokens -> list of lemmas).
df_train['text'] =df_train['text'].apply(lambda x: lem_words(x))
## checkout train dataset with lemmatized words
df_train['text'].head(5)
0 [deed, reason, earthquake, may, allah, forgive... 1 [forest, fire, near, la, ronge, sask, canada] 2 [resident, asked, shelter, place, notified, of... 3 [people, receive, wildfire, evacuation, order,... 4 [got, sent, photo, ruby, alaska, smoke, wildfi... Name: text, dtype: object
def combine_txt(t):
    """Join a sequence of tokens back into one space-separated string."""
    return ' '.join(t)
# Re-join the lemmatized tokens into plain strings for the models below.
df_train['text'] =df_train['text'].apply(lambda x: combine_txt(x))
## checkout train dataset with lemmatized words
df_train['text'].head(5)
0 deed reason earthquake may allah forgive u 1 forest fire near la ronge sask canada 2 resident asked shelter place notified officer ... 3 people receive wildfire evacuation order calif... 4 got sent photo ruby alaska smoke wildfire pour... Name: text, dtype: object
# Print a few cleaned disaster tweets. Use positional .iloc rather than
# label-based tr[i]: the original depended on row labels 1-4 happening to
# be disaster rows.
tr = df_train[df_train['target'] == 1]['text']
for i in range(1, 5):
    print(tr.iloc[i])
forest fire near la ronge sask canada resident asked shelter place notified officer evacuation shelter place order expected people receive wildfire evacuation order california got sent photo ruby alaska smoke wildfire pours school
# Cleaned non-disaster tweets (target == 0) for comparison.
non = df_train[df_train['target'] != 1]['text']
print(non.head(5))
15 whats man 16 love fruit 17 summer lovely 18 car fast 19 goooooooaaaaaal Name: text, dtype: object
# Word clouds on the cleaned corpus. Join with a space: the original
# ''.join fused the last word of each tweet with the first word of the next.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[150, 50])
wc1 = WordCloud().generate(' '.join(tr))
ax1.imshow(wc1)
ax1.axis('off')
ax1.set_title('Disaster tweets', fontsize=18)
wc2 = WordCloud().generate(' '.join(non))
ax2.imshow(wc2)
ax2.axis('off')
ax2.set_title('Non Disaster tweets', fontsize=18)
Text(0.5, 1.0, 'Non Disaster tweets')
# Use a TPU when one is reachable, otherwise fall back to the default
# (single-device) distribution strategy. The original bare `except:` would
# also swallow KeyboardInterrupt and genuine bugs; TPUClusterResolver raises
# ValueError when no TPU is available, so catch that specifically.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy()
print('Number of replicas in sync: ', strategy.num_replicas_in_sync)
Number of replicas in sync: 1
# NOTE(review): the tokenizer checkpoint ('bert-large-uncased') differs from
# the model checkpoint ('bert-base-uncased') loaded below; the uncased
# checkpoints presumably share one WordPiece vocab, but keep them consistent
# — confirm intent.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

def bert_encode(data, maximum_length):
    """Tokenize an iterable of strings for BERT.

    Returns a (input_ids, attention_masks) pair of numpy int arrays, each row
    padded/truncated to `maximum_length`.
    """
    input_ids = []
    attention_masks = []
    for text in data:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=maximum_length,
            # padding='max_length' replaces the deprecated pad_to_max_length=True;
            # truncation=True makes the previously implicit 'longest_first'
            # truncation explicit (both were warned about at runtime).
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)
# Encode the cleaned tweets for BERT: token ids + attention masks, length 128.
# NOTE(review): the first model below declares inputs of length 60 — confirm
# the two lengths agree before training.
texts = df_train['text']
target = df_train['target']
train_input_ids, train_attention_masks = bert_encode(texts,128)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`. C:\Users\krist\AppData\Roaming\Python\Python38\site-packages\transformers\tokenization_utils_base.py:2221: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).
def create_model(bert_model, max_len=60):
    """Binary classification head on top of a pretrained BERT encoder.

    Dense(32, relu) -> Dropout(0.2) -> Dense(1, sigmoid) over the encoder's
    pooled output; compiled with Adam(1e-5) + binary cross-entropy.

    Args:
        bert_model: a pretrained TFBertModel instance.
        max_len: length of the id/mask input sequences. Defaults to 60 to
            match the original hard-coded shape — NOTE(review): bert_encode
            above was called with maximum_length=128; confirm the lengths
            agree.

    Returns:
        A compiled tf.keras Model taking [input_ids, attention_masks].
    """
    input_ids = tf.keras.Input(shape=(max_len,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,), dtype='int32')
    output = bert_model([input_ids, attention_masks])
    # Index 1 of the TFBertModel outputs is the pooled sequence representation.
    output = output[1]
    output = tf.keras.layers.Dense(32, activation='relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(output)
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Load the pretrained BERT encoder; the MLM/NSP pretraining heads are unused.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls'] - This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
# Build the classifier head on the pretrained encoder and inspect its layers.
model = create_model(bert_model)
model.summary()
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 60)] 0
__________________________________________________________________________________________________
input_2 (InputLayer) [(None, 60)] 0
__________________________________________________________________________________________________
tf_bert_model (TFBertModel) TFBaseModelOutputWit 109482240 input_1[0][0]
input_2[0][0]
__________________________________________________________________________________________________
dense (Dense) (None, 32) 24608 tf_bert_model[0][1]
__________________________________________________________________________________________________
dropout_37 (Dropout) (None, 32) 0 dense[0][0]
__________________________________________________________________________________________________
dense_1 (Dense) (None, 1) 33 dropout_37[0][0]
==================================================================================================
Total params: 109,506,881
Trainable params: 109,506,881
Non-trainable params: 0
__________________________________________________________________________________________________
# Fine-tune end-to-end; the last 20% of rows is held out for validation.
history = model.fit(
    [train_input_ids, train_attention_masks],
    target,
    validation_split=0.2,
    epochs=5,
    batch_size=10
)
Epoch 1/5 609/609 [==============================] - 1736s 3s/step - loss: 0.4904 - accuracy: 0.7759 - val_loss: 0.4426 - val_accuracy: 0.8070 Epoch 2/5 609/609 [==============================] - 1265s 2s/step - loss: 0.3849 - accuracy: 0.8476 - val_loss: 0.4087 - val_accuracy: 0.8214 Epoch 3/5 609/609 [==============================] - 1273s 2s/step - loss: 0.3112 - accuracy: 0.8813 - val_loss: 0.4249 - val_accuracy: 0.8122 Epoch 4/5 609/609 [==============================] - 2946s 5s/step - loss: 0.2383 - accuracy: 0.9108 - val_loss: 0.5921 - val_accuracy: 0.7886 Epoch 5/5 609/609 [==============================] - 1290s 2s/step - loss: 0.1768 - accuracy: 0.9355 - val_loss: 0.5406 - val_accuracy: 0.8017
# Learning curves for the fine-tuned classifier: accuracy then loss,
# train vs. validation per epoch.
#accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(['Train','Val'],loc='upper left')
plt.show()
#Loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(['Train','Val'],loc='upper left')
plt.show()
def build_model(bert_model, max_len=128):
    """Minimal BERT binary classifier: pooled output -> single sigmoid unit.

    Args:
        bert_model: a pretrained TFBertModel instance.
        max_len: length of the id/mask input sequences (default 128, matching
            the maximum_length used when encoding the training data).

    Returns:
        A compiled tf.keras Model (SGD with momentum, binary cross-entropy).
    """
    input_ids = tf.keras.Input(shape=(max_len,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,), dtype='int32')
    output = bert_model([input_ids, attention_masks])
    # Index 1 of the TFBertModel outputs is the pooled sequence representation.
    output = output[1]
    output = tf.keras.layers.Dense(1, activation='sigmoid')(output)
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
    optimizer = SGD(learning_rate=0.0001, momentum=0.8)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model
# Build the simpler (no hidden layer) BERT classifier and inspect it.
model = build_model(bert_model)
model.summary()
Model: "model_1"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_13 (InputLayer) [(None, 128)] 0
__________________________________________________________________________________________________
input_14 (InputLayer) [(None, 128)] 0
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel) TFBaseModelOutputWit 109482240 input_13[0][0]
input_14[0][0]
__________________________________________________________________________________________________
dense_8 (Dense) (None, 1) 769 tf_bert_model_1[7][1]
==================================================================================================
Total params: 109,483,009
Trainable params: 109,483,009
Non-trainable params: 0
__________________________________________________________________________________________________
# Fine-tune the SGD variant; this run was interrupted at epoch 9 (see the
# KeyboardInterrupt traceback below) and re-run in the following cells.
history = model.fit(
    [train_input_ids, train_attention_masks],
    target,
    validation_split=0.2,
    epochs=10,
    batch_size=32)
Epoch 1/10 191/191 [==============================] - 3749s 19s/step - loss: 0.6666 - accuracy: 0.6002 - val_loss: 0.6415 - val_accuracy: 0.6835 Epoch 2/10 191/191 [==============================] - 2804s 15s/step - loss: 0.5996 - accuracy: 0.7204 - val_loss: 0.5311 - val_accuracy: 0.7702 Epoch 3/10 191/191 [==============================] - 2806s 15s/step - loss: 0.5147 - accuracy: 0.7658 - val_loss: 0.4703 - val_accuracy: 0.7853 Epoch 4/10 191/191 [==============================] - 3442s 18s/step - loss: 0.4778 - accuracy: 0.7877 - val_loss: 0.4537 - val_accuracy: 0.7886 Epoch 5/10 191/191 [==============================] - 6074s 32s/step - loss: 0.4651 - accuracy: 0.7929 - val_loss: 0.4377 - val_accuracy: 0.8096 Epoch 6/10 191/191 [==============================] - 3521s 18s/step - loss: 0.4478 - accuracy: 0.7985 - val_loss: 0.4306 - val_accuracy: 0.8096 Epoch 7/10 191/191 [==============================] - 2743s 14s/step - loss: 0.4322 - accuracy: 0.8108 - val_loss: 0.4318 - val_accuracy: 0.8096 Epoch 8/10 191/191 [==============================] - 2762s 14s/step - loss: 0.4270 - accuracy: 0.8141 - val_loss: 0.4284 - val_accuracy: 0.8102 Epoch 9/10 20/191 [==>...........................] - ETA: 39:32 - loss: 0.4205 - accuracy: 0.8047
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_13884/1448229589.py in <module> ----> 1 history = model.fit( 2 [train_input_ids, train_attention_masks], 3 target, 4 validation_split=0.2, 5 epochs=10, ~\anaconda3\envs\general\lib\site-packages\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing) 1182 _r=1): 1183 callbacks.on_train_batch_begin(step) -> 1184 tmp_logs = self.train_function(iterator) 1185 if data_handler.should_sync: 1186 context.async_wait() ~\anaconda3\envs\general\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds) 883 884 with OptionalXlaContext(self._jit_compile): --> 885 result = self._call(*args, **kwds) 886 887 new_tracing_count = self.experimental_get_tracing_count() ~\anaconda3\envs\general\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds) 915 # In this case we have created variables on the first call, so we run the 916 # defunned version which is guaranteed to never create variables. 
--> 917 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable 918 elif self._stateful_fn is not None: 919 # Release the lock early so that multiple threads can perform the call ~\anaconda3\envs\general\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs) 3037 (graph_function, 3038 filtered_flat_args) = self._maybe_define_function(args, kwargs) -> 3039 return graph_function._call_flat( 3040 filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access 3041 ~\anaconda3\envs\general\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager) 1961 and executing_eagerly): 1962 # No tape is watching; skip to running the function. -> 1963 return self._build_call_outputs(self._inference_function.call( 1964 ctx, args, cancellation_manager=cancellation_manager)) 1965 forward_backward = self._select_forward_and_backward_functions( ~\anaconda3\envs\general\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager) 589 with _InterpolateFunctionError(self): 590 if cancellation_manager is None: --> 591 outputs = execute.execute( 592 str(self.signature.name), 593 num_outputs=self._num_outputs, ~\anaconda3\envs\general\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name) 57 try: 58 ctx.ensure_initialized() ---> 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name, 60 inputs, attrs, num_outputs) 61 except core._NotOkStatusException as e: KeyboardInterrupt:
def build_model(bert_model):
    """Attach a sigmoid classification head to a pretrained TF BERT model.

    Two length-128 inputs (token ids and attention mask) are fed through
    ``bert_model``; the pooled output (element 1 of the BERT outputs) is
    classified by a single sigmoid unit. Compiled with SGD + binary
    cross-entropy for the 0/1 disaster-tweet target.
    """
    ids_in = tf.keras.Input(shape=(128,), dtype='int32')
    masks_in = tf.keras.Input(shape=(128,), dtype='int32')
    # element [1] is BERT's pooled [CLS] representation
    pooled = bert_model([ids_in, masks_in])[1]
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(pooled)
    model = tf.keras.models.Model(inputs=[ids_in, masks_in], outputs=probability)
    model.compile(loss='binary_crossentropy',
                  optimizer=SGD(learning_rate=0.0001, momentum=0.8),
                  metrics=['accuracy'])
    return model
# Build the BERT-based classifier and print its layer summary
# (the summary output is reproduced below).
model = build_model(bert_model)
model.summary()
Model: "model_1"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_3 (InputLayer) [(None, 128)] 0
__________________________________________________________________________________________________
input_4 (InputLayer) [(None, 128)] 0
__________________________________________________________________________________________________
tf_bert_model (TFBertModel) multiple 109482240 input_3[0][0]
input_4[0][0]
__________________________________________________________________________________________________
dense_2 (Dense) (None, 1) 769 tf_bert_model[1][1]
==================================================================================================
Total params: 109,483,009
Trainable params: 109,483,009
Non-trainable params: 0
__________________________________________________________________________________________________
# Fine-tune BERT: 20% of the rows are held out for validation.
# NOTE(review): an earlier 10-epoch run was interrupted (traceback above);
# this 5-epoch, batch_size=10 run is the one whose log follows.
history = model.fit(
    [train_input_ids, train_attention_masks],
    target,
    validation_split=0.2,
    epochs=5,
    batch_size=10)
Epoch 1/5 609/609 [==============================] - 3022s 5s/step - loss: 0.5799 - accuracy: 0.6951 - val_loss: 0.4710 - val_accuracy: 0.7899 Epoch 2/5 609/609 [==============================] - 2670s 4s/step - loss: 0.4693 - accuracy: 0.7885 - val_loss: 0.4671 - val_accuracy: 0.7859 Epoch 3/5 609/609 [==============================] - 5051s 8s/step - loss: 0.4335 - accuracy: 0.8090 - val_loss: 0.4476 - val_accuracy: 0.8011 Epoch 4/5 609/609 [==============================] - 6238s 10s/step - loss: 0.4183 - accuracy: 0.8135 - val_loss: 0.4281 - val_accuracy: 0.8181 Epoch 5/5 609/609 [==============================] - 3054s 5s/step - loss: 0.4009 - accuracy: 0.8299 - val_loss: 0.4352 - val_accuracy: 0.8109
# accuracy: train vs validation accuracy per epoch
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(['Train','Val'],loc='upper left')
plt.show()
# Loss: train vs validation loss per epoch
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(['Train','Val'],loc='upper left')
plt.show()
# Tick labels for the annotated heatmap. The y labels are listed
# bottom-to-top, which is why conf_matrix flips the matrix rows first.
x_axes = ['Disaster', 'Non-disaster']
y_axes = ['Non-disaster', 'Disaster']
def conf_matrix(z, x=x_axes, y=y_axes):
    """Render a confusion matrix as an annotated Plotly heatmap.

    Parameters
    ----------
    z : array-like
        2x2 matrix from ``sklearn.metrics.confusion_matrix``.
    x, y : list of str
        Axis tick labels (defaults defined above).

    Returns
    -------
    Plotly figure with cell annotations and a visible colorbar.
    """
    # flip rows so the matrix origin lines up with the label ordering
    z = np.flip(z, 0)
    # change each element of z to type string for annotations
    z_text = [[str(y) for y in x] for x in z]
    # set up figure
    fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='Reds')
    # add a title
    fig.update_layout(title_text='<b>Confusion matrix</b>',
                      xaxis = dict(title='Predicted value'),
                      yaxis = dict(title='Real value')
                      )
    # add colorbar
    fig['data'][0]['showscale'] = True
    return fig
# Baseline experiment features: raw tweet text only.
x = df_train['text']
y = df_train['target']
# Split into train and test sets (sklearn default 75/25 split)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))
5709 5709 1904 1904
# Bag-of-words -> TF-IDF -> XGBoost pipeline over the raw text column.
pipe = Pipeline([
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=80,
        use_label_encoder=False,
        eval_metric='auc',
    ))
])
# Fit the pipeline with the data
pipe.fit(x_train, y_train)
y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)
# Report train vs test accuracy to gauge overfitting
print('Train: {}'.format(metrics.accuracy_score(y_train, y_pred_train)))
print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))
XGBoost_cf1 = conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))
XGBoost_cf1
Train: 0.7985636713960413 Test: 0.7531512605042017
# Second experiment: text plus two extra columns.
# NOTE(review): 'URL' and 'number' appear to be engineered feature columns
# added earlier in the notebook — confirm upstream.
x = df_train[['text', 'URL', 'number']]
y = df_train['target']
# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))
5709 5709 1904 1904
# Preprocessing: impute + one-hot encode non-numeric columns,
# impute + scale numeric columns.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ])
numerical_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")),
           ("scale", StandardScaler())])
categorical_columns = x.select_dtypes(exclude="number").columns
numerical_columns = x.select_dtypes(include="number").columns
full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline, numerical_columns),
        ("categorical", categorical_pipeline, categorical_columns),
    ])
# Apply preprocessing
X_processed = full_processor.fit_transform(x)
# .ravel() flattens the imputer's column vector back to 1-D, avoiding the
# DataConversionWarning seen in the original run.
y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
    y.values.reshape(-1, 1)).ravel()
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed, stratify=y_processed, random_state=1121218)
# Init classifier. use_label_encoder=False and an explicit eval_metric
# silence the deprecation warnings the original run emitted; the earlier
# bare XGBClassifier() instance was discarded and is removed.
xgb_cl = xgb.XGBClassifier(objective="binary:logistic",
                           use_label_encoder=False,
                           eval_metric='logloss')
# Fit
xgb_cl.fit(X_train, y_train)
# Predict
preds = xgb_cl.predict(X_test)
# Score
print('Test: {}'.format(metrics.accuracy_score(y_test, preds)))
C:\Users\krist\AppData\Roaming\Python\Python38\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. C:\Program Files\Anaconda\lib\site-packages\sklearn\utils\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
[11:36:07] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. Test: 0.6460084033613446
# Confusion matrix for the column-feature XGBoost model.
XGBoost_cf2 =conf_matrix(metrics.confusion_matrix(y_test, preds))
XGBoost_cf2
# Third experiment: every column except the target.
X = df_train.drop(['target'], axis=1)
y = df_train['target']
# Impute + one-hot non-numeric columns; impute + scale numeric ones.
cat_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ])
num_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")),
           ("scale", StandardScaler())])
cat_columns = X.select_dtypes(exclude="number").columns
num_cols = X.select_dtypes(include="number").columns
full_procesor = ColumnTransformer(
    transformers=[
        ("numeric", num_pipeline, num_cols),
        ("categorical", cat_pipeline, cat_columns),
    ])
# NOTE(review): this instance is re-created before fitting further down;
# it is only used for the type() check here.
xgb_cls = xgb.XGBClassifier()
print(type(xgb_cls))
<class 'xgboost.sklearn.XGBClassifier'>
# Apply preprocessing
X_processed = full_procesor.fit_transform(X)
# .ravel() flattens the imputer's column vector back to 1-D, avoiding the
# DataConversionWarning seen in the original run.
y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
    y.values.reshape(-1, 1)).ravel()
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed, stratify=y_processed, random_state=1121218)
# Init classifier. use_label_encoder=False and an explicit eval_metric
# silence the deprecation warnings the original run emitted.
xgb_cls = xgb.XGBClassifier(objective="binary:logistic",
                            use_label_encoder=False,
                            eval_metric='logloss')
# Fit
xgb_cls.fit(X_train, y_train)
# Predict
pred = xgb_cls.predict(X_test)
# Score (the original also had a bare accuracy_score(...) call whose
# result was discarded; removed as dead code)
print('Test: {}'.format(metrics.accuracy_score(y_test, pred)))
C:\Users\lia\anaconda3\envs\general\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. C:\Users\lia\anaconda3\envs\general\lib\site-packages\sklearn\preprocessing\_label.py:98: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\lia\anaconda3\envs\general\lib\site-packages\sklearn\preprocessing\_label.py:133: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
[10:28:29] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. Test: 0.7279411764705882
# Confusion matrix for the all-columns XGBoost model.
XGBoost_cf3 =conf_matrix(metrics.confusion_matrix(y_test, pred))
XGBoost_cf3
# Raw tweet arrays for the GloVe/LSTM experiment.
train_tweets = df_train['text'].values
test_tweets = df_test['text'].values
train_target = df_train['target'].values
# Calculate the length of our vocabulary; +1 reserves index 0 for padding
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(train_tweets)
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length
14434
def embed(corpus):
    """Map each text in *corpus* to a list of token indices using the
    module-level ``word_tokenizer``."""
    return word_tokenizer.texts_to_sequences(corpus)
# Pad every tokenized tweet (post-padding with zeros) to the length of the
# longest train tweet, measured in NLTK word tokens, so the LSTM receives
# fixed-size input.
longest_train = max(train_tweets, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(longest_train))
train_padded_sentences = pad_sequences(
    embed(train_tweets),
    length_long_sentence,
    padding='post'
)
test_padded_sentences = pad_sequences(
    embed(test_tweets),
    length_long_sentence,
    padding='post'
)
train_padded_sentences
array([[3793, 427, 221, ..., 0, 0, 0],
[ 106, 3, 139, ..., 0, 0, 0],
[1474, 1347, 1798, ..., 0, 0, 0],
...,
[ 544, 1339, 1, ..., 0, 0, 0],
[ 18, 950, 2701, ..., 0, 0, 0],
[ 125, 20, 450, ..., 0, 0, 0]])
embeddings_dictionary = dict()
embedding_dim = 100
# Load GloVe 100D embeddings.
# Iterate the file object directly instead of fp.readlines(): readlines()
# materializes the entire multi-hundred-MB file in memory at once, while
# direct iteration streams it line by line with identical results.
with open('./data/glove.6B.100d.txt', encoding="utf8") as fp:
    for line in fp:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions
# Row i of the matrix holds the GloVe vector for token index i; tokens
# absent from GloVe keep an all-zero row.
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
embedding_matrix
array([[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[-0.78276002, 0.80839002, 0.34307 , ..., 0.083831 ,
0.47064999, 0.59265 ],
[-0.031071 , -0.22633 , -0.18579 , ..., -0.063149 ,
-0.60852998, -0.24131 ],
...,
[ 0.19814 , -0.33517 , -0.13950001, ..., -0.070356 ,
-0.18391 , 0.62439001],
[-0.25161999, 0.73246998, 0.30792999, ..., -0.89315999,
1.29760003, -0.04898 ],
[-0.34132001, 0.26423001, 0.47813001, ..., -0.92395002,
0.48275 , 0.52947998]])
# split data into train and test set (25% held out; no random_state is
# fixed, so re-runs yield different splits)
X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences,
    train_target,
    test_size=0.25)
# create model
def glove_lstm():
    """Build and compile the GloVe-initialised BiLSTM binary classifier.

    Architecture: Embedding initialised from ``embedding_matrix`` ->
    bidirectional LSTM -> global max pool -> batch norm -> two ReLU dense
    layers with 0.5 dropout -> single sigmoid unit. Compiled with rmsprop
    and binary cross-entropy.
    """
    stack = [
        Embedding(
            input_dim=embedding_matrix.shape[0],
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],
            input_length=length_long_sentence,
        ),
        Bidirectional(LSTM(
            length_long_sentence,
            return_sequences=True,
            recurrent_dropout=0.2,
        )),
        GlobalMaxPool1D(),
        BatchNormalization(),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
# Instantiate the GloVe BiLSTM and print its layer summary.
model = glove_lstm()
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 23, 100) 1443400 _________________________________________________________________ bidirectional (Bidirectional (None, 23, 46) 22816 _________________________________________________________________ global_max_pooling1d (Global (None, 46) 0 _________________________________________________________________ batch_normalization (BatchNo (None, 46) 184 _________________________________________________________________ dropout (Dropout) (None, 46) 0 _________________________________________________________________ dense (Dense) (None, 23) 1081 _________________________________________________________________ dropout_1 (Dropout) (None, 23) 0 _________________________________________________________________ dense_1 (Dense) (None, 23) 552 _________________________________________________________________ dropout_2 (Dropout) (None, 23) 0 _________________________________________________________________ dense_2 (Dense) (None, 1) 24 ================================================================= Total params: 1,468,057 Trainable params: 1,467,965 Non-trainable params: 92 _________________________________________________________________
# train model (fresh instance; the checkpoint keeps only the weights with
# the best validation loss)
model = glove_lstm()
checkpoint = ModelCheckpoint(
    'model.h5',
    monitor = 'val_loss',
    verbose = 1,
    save_best_only = True)
# NOTE(review): min_lr here equals rmsprop's default learning rate
# (0.001), so this callback can never actually lower the rate — confirm
# whether a smaller floor was intended.
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.2,
    verbose = 1,
    patience = 5,
    min_lr = 0.001)
history = model.fit(
    X_train,
    y_train,
    epochs = 7,
    batch_size = 32,
    validation_data = (X_test, y_test),
    verbose = 1,
    callbacks = [reduce_lr, checkpoint])
Epoch 1/7 179/179 [==============================] - 25s 73ms/step - loss: 0.7341 - accuracy: 0.5915 - val_loss: 0.6538 - val_accuracy: 0.7332 Epoch 00001: val_loss improved from inf to 0.65376, saving model to model.h5 Epoch 2/7 179/179 [==============================] - 11s 64ms/step - loss: 0.6233 - accuracy: 0.6782 - val_loss: 0.5570 - val_accuracy: 0.7789 Epoch 00002: val_loss improved from 0.65376 to 0.55704, saving model to model.h5 Epoch 3/7 179/179 [==============================] - 11s 63ms/step - loss: 0.5562 - accuracy: 0.7325 - val_loss: 0.4628 - val_accuracy: 0.8067 Epoch 00003: val_loss improved from 0.55704 to 0.46278, saving model to model.h5 Epoch 4/7 179/179 [==============================] - 11s 63ms/step - loss: 0.5206 - accuracy: 0.7691 - val_loss: 0.4494 - val_accuracy: 0.8083 Epoch 00004: val_loss improved from 0.46278 to 0.44944, saving model to model.h5 Epoch 5/7 179/179 [==============================] - 11s 63ms/step - loss: 0.4893 - accuracy: 0.7937 - val_loss: 0.4494 - val_accuracy: 0.8099 Epoch 00005: val_loss improved from 0.44944 to 0.44943, saving model to model.h5 Epoch 6/7 179/179 [==============================] - 11s 63ms/step - loss: 0.4618 - accuracy: 0.8110 - val_loss: 0.4165 - val_accuracy: 0.8188 Epoch 00006: val_loss improved from 0.44943 to 0.41653, saving model to model.h5 Epoch 7/7 179/179 [==============================] - 11s 62ms/step - loss: 0.4443 - accuracy: 0.8227 - val_loss: 0.4227 - val_accuracy: 0.8225 Epoch 00007: val_loss did not improve from 0.41653
# accuracy: train vs validation accuracy per epoch
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(['Train','Val'],loc='upper left')
plt.show()
# loss: train vs validation loss per epoch
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(['Train','Val'],loc='upper left')
plt.show()
# Threshold sigmoid outputs at 0.5; "+0" casts the boolean array to ints.
y_pred = (model.predict(X_test).ravel()>0.5)+0
lstm_cf1 = conf_matrix(metrics.confusion_matrix(y_test, y_pred))
lstm_cf1
# Reload the raw training data for the feature-engineered LSTM run.
df_lstm = pd.read_csv("./data/train.csv")
def features_text(text):
    """Normalize a raw tweet: lowercase, collapse whitespace, strip punctuation.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Lower-cased text with line breaks and runs of whitespace collapsed
        to single spaces and all ASCII punctuation removed.
    """
    # convert to lower
    text = text.lower()
    # remove line breaks
    text = re.sub(r'\n', ' ', text)
    # remove trailing spaces, tabs — raw strings fix the invalid escape
    # sequence '\s' (a SyntaxWarning on modern Python) in the original
    text = re.sub(r'\s+', ' ', text)
    # remove punctuation
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    return text
# Apply clean function on random train string
df_lstm['text'] = df_lstm['text'].apply(features_text)
# Stopwords (remove_stopwords is defined earlier in the notebook)
df_lstm['text'] = df_lstm['text'].apply(remove_stopwords)
## Applying tokenization function on train sets
df_lstm['text'] = df_lstm['text'].map(tokenizer.tokenize)
# lemmatization (lem_words defined earlier in the notebook)
df_lstm['text'] =df_lstm['text'].apply(lambda x: lem_words(x))
# combine text: join token lists back into single strings
df_lstm['text'] =df_lstm['text'].apply(lambda x: combine_txt(x))
# NOTE(review): train_tweets1 and test_tweets1 are the same column — the
# later train_test_split is what actually separates evaluation data;
# confirm this duplication is intended.
train_tweets1 = df_lstm['text'].values
test_tweets1 = df_lstm['text'].values
train_target1 = df_lstm['target'].values
# Calculate the length of our vocabulary
# NOTE(review): this re-fits the tokenizer already fitted on the raw
# tweets, so the word index accumulates both vocabularies (14434 -> 21053
# per the printed outputs); a fresh Tokenizer may have been intended.
word_tokenizer.fit_on_texts(train_tweets1)
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length
21053
train_tweets1
array(['deed reason earthquake may allah forgive u',
'forest fire near la ronge sask canada',
'resident asked shelter place notified officer evacuation shelter place order expected',
..., 'm194 0104 utc5km volcano hawaii httptcozdtoyd8ebj',
'police investigating ebike collided car little portugal ebike rider suffered serious nonlife threatening injury',
'latest home razed northern california wildfire abc news httptcoymy4rskq3d'],
dtype=object)
# Re-pad using the cleaned tweets: length of the longest cleaned tweet in
# NLTK word tokens becomes the fixed sequence length.
longest_train = max(train_tweets1, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(longest_train))
train_padded_sentences = pad_sequences(
    embed(train_tweets1),
    length_long_sentence,
    padding='post'
)
test_padded_sentences = pad_sequences(
    embed(test_tweets1),
    length_long_sentence,
    padding='post'
)
train_padded_sentences
array([[ 4000, 450, 188, ..., 0, 0, 0],
[ 107, 3, 146, ..., 0, 0, 0],
[ 1537, 1407, 1881, ..., 0, 0, 0],
...,
[ 5749, 5946, 14879, ..., 0, 0, 0],
[ 18, 973, 2798, ..., 0, 0, 0],
[ 130, 20, 448, ..., 0, 0, 0]])
embeddings_dictionary = dict()
embedding_dim = 100
# Load GloVe 100D embeddings.
# Iterate the file object directly instead of fp.readlines(): readlines()
# materializes the entire multi-hundred-MB file in memory at once, while
# direct iteration streams it line by line with identical results.
with open('./data/glove.6B.100d.txt', encoding="utf8") as fp:
    for line in fp:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions
# Rebuild the matrix for the enlarged vocabulary. The trailing zero rows
# visible in the printed output suggest many cleaned/lemmatized tokens
# have no GloVe entry — TODO confirm that is acceptable.
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
embedding_matrix
array([[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[-0.78276002, 0.80839002, 0.34307 , ..., 0.083831 ,
0.47064999, 0.59265 ],
[-0.031071 , -0.22633 , -0.18579 , ..., -0.063149 ,
-0.60852998, -0.24131 ],
...,
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ]])
# split data into train and test set
# NOTE(review): targets come from train_target (df_train), not
# train_target1 (df_lstm). Both derive from the same train.csv so the
# values should match, but confirm train_target1 was not intended here.
X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences,
    train_target,
    test_size=0.25)
# create model
def features_lstm():
    """Build and compile the BiLSTM classifier for the cleaned-text run.

    Same topology as the raw-text model: GloVe-initialised Embedding ->
    bidirectional LSTM -> global max pool -> batch norm -> two ReLU dense
    layers with 0.5 dropout -> sigmoid output; rmsprop + binary
    cross-entropy.
    """
    stack = [
        Embedding(
            input_dim=embedding_matrix.shape[0],
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],
            input_length=length_long_sentence,
        ),
        Bidirectional(LSTM(
            length_long_sentence,
            return_sequences=True,
            recurrent_dropout=0.2,
        )),
        GlobalMaxPool1D(),
        BatchNormalization(),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
# Instantiate the cleaned-text BiLSTM and print its layer summary.
model = features_lstm()
model.summary()
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_2 (Embedding) (None, 24, 100) 2105300 _________________________________________________________________ bidirectional_2 (Bidirection (None, 24, 48) 24000 _________________________________________________________________ global_max_pooling1d_2 (Glob (None, 48) 0 _________________________________________________________________ batch_normalization_2 (Batch (None, 48) 192 _________________________________________________________________ dropout_6 (Dropout) (None, 48) 0 _________________________________________________________________ dense_6 (Dense) (None, 24) 1176 _________________________________________________________________ dropout_7 (Dropout) (None, 24) 0 _________________________________________________________________ dense_7 (Dense) (None, 24) 600 _________________________________________________________________ dropout_8 (Dropout) (None, 24) 0 _________________________________________________________________ dense_8 (Dense) (None, 1) 25 ================================================================= Total params: 2,131,293 Trainable params: 2,131,197 Non-trainable params: 96 _________________________________________________________________
# train model (fresh instance; checkpoint keeps the best-val-loss weights
# in a separate file from the raw-text run)
model = features_lstm()
checkpoint = ModelCheckpoint(
    'model_f.h5',
    monitor = 'val_loss',
    verbose = 1,
    save_best_only = True)
# NOTE(review): min_lr equals rmsprop's default learning rate (0.001), so
# this callback can never actually lower the rate — confirm intent.
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.2,
    verbose = 1,
    patience = 5,
    min_lr = 0.001)
history = model.fit(
    X_train,
    y_train,
    epochs = 7,
    batch_size = 32,
    validation_data = (X_test, y_test),
    verbose = 1,
    callbacks = [reduce_lr, checkpoint])
Epoch 1/7 179/179 [==============================] - 25s 74ms/step - loss: 0.7191 - accuracy: 0.5871 - val_loss: 0.6388 - val_accuracy: 0.7258 Epoch 00001: val_loss improved from inf to 0.63877, saving model to model_f.h5 Epoch 2/7 179/179 [==============================] - 12s 66ms/step - loss: 0.5902 - accuracy: 0.7108 - val_loss: 0.5122 - val_accuracy: 0.7862 Epoch 00002: val_loss improved from 0.63877 to 0.51222, saving model to model_f.h5 Epoch 3/7 179/179 [==============================] - 12s 66ms/step - loss: 0.5207 - accuracy: 0.7646 - val_loss: 0.4684 - val_accuracy: 0.7899 Epoch 00003: val_loss improved from 0.51222 to 0.46837, saving model to model_f.h5 Epoch 4/7 179/179 [==============================] - 12s 66ms/step - loss: 0.4762 - accuracy: 0.7949 - val_loss: 0.4541 - val_accuracy: 0.7952 Epoch 00004: val_loss improved from 0.46837 to 0.45411, saving model to model_f.h5 Epoch 5/7 179/179 [==============================] - 12s 66ms/step - loss: 0.4401 - accuracy: 0.8187 - val_loss: 0.4599 - val_accuracy: 0.7962 Epoch 00005: val_loss did not improve from 0.45411 Epoch 6/7 179/179 [==============================] - 12s 66ms/step - loss: 0.4107 - accuracy: 0.8369 - val_loss: 0.4708 - val_accuracy: 0.7983 Epoch 00006: val_loss did not improve from 0.45411 Epoch 7/7 179/179 [==============================] - 12s 66ms/step - loss: 0.3992 - accuracy: 0.8469 - val_loss: 0.4915 - val_accuracy: 0.7978 Epoch 00007: val_loss did not improve from 0.45411
# Threshold sigmoid outputs at 0.5 ("+0" casts booleans to ints) and plot
# the confusion matrix for the cleaned-text model.
y_pred = (model.predict(X_test).ravel()>0.5)+0
conf_matrix(metrics.confusion_matrix(y_test, y_pred))